Load Package

library(tidyverse)
library(ggplot2)
library(ggcorrplot)
library(corrplot)
library(leaps)
library(car)
library(Metrics)
library(reshape2)
library(ggpubr)
library(moments)
library(naniar)
library(VIM)
library(plotly)

Part 1. Load data

data_input<-read_csv("Life Expectancy Data.csv")
str(data_input)
## spc_tbl_ [2,938 × 22] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ country                        : chr [1:2938] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ year                           : num [1:2938] 2015 2014 2013 2012 2011 ...
##  $ status                         : chr [1:2938] "Developing" "Developing" "Developing" "Developing" ...
##  $ life_expectancy                : num [1:2938] 65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
##  $ adult_mortality                : num [1:2938] 263 271 268 272 275 279 281 287 295 295 ...
##  $ infant_mortality               : num [1:2938] 62 64 66 69 71 74 77 80 82 84 ...
##  $ alcohol                        : num [1:2938] 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
##  $ percent_expenditure            : num [1:2938] 71.3 73.5 73.2 78.2 7.1 ...
##  $ hepatitis_B                    : num [1:2938] 65 62 64 67 68 66 63 64 63 64 ...
##  $ measles                        : num [1:2938] 1154 492 430 2787 3013 ...
##  $ BMI                            : num [1:2938] 19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
##  $ under_5_mortality              : num [1:2938] 83 86 89 93 97 102 106 110 113 116 ...
##  $ polio                          : num [1:2938] 6 58 62 67 68 66 63 64 63 58 ...
##  $ total_expenditure              : num [1:2938] 8.16 8.18 8.13 8.52 7.87 9.2 9.42 8.33 6.73 7.43 ...
##  $ diphtheria                     : num [1:2938] 65 62 64 67 68 66 63 64 63 58 ...
##  $ HIV/AIDS_mortality             : num [1:2938] 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
##  $ GDP                            : num [1:2938] 584.3 612.7 631.7 670 63.5 ...
##  $ population                     : num [1:2938] 33736494 327582 31731688 3696958 2978599 ...
##  $ thinness_1-19                  : num [1:2938] 17.2 17.5 17.7 17.9 18.2 18.4 18.6 18.8 19 19.2 ...
##  $ thinness_5-9                   : num [1:2938] 17.3 17.5 17.7 18 18.2 18.4 18.7 18.9 19.1 19.3 ...
##  $ income_composition_of_resources: num [1:2938] 0.479 0.476 0.47 0.463 0.454 0.448 0.434 0.433 0.415 0.405 ...
##  $ schooling                      : num [1:2938] 10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   country = col_character(),
##   ..   year = col_double(),
##   ..   status = col_character(),
##   ..   life_expectancy = col_double(),
##   ..   adult_mortality = col_double(),
##   ..   infant_mortality = col_double(),
##   ..   alcohol = col_double(),
##   ..   percent_expenditure = col_double(),
##   ..   hepatitis_B = col_double(),
##   ..   measles = col_double(),
##   ..   BMI = col_double(),
##   ..   under_5_mortality = col_double(),
##   ..   polio = col_double(),
##   ..   total_expenditure = col_double(),
##   ..   diphtheria = col_double(),
##   ..   `HIV/AIDS_mortality` = col_double(),
##   ..   GDP = col_double(),
##   ..   population = col_double(),
##   ..   `thinness_1-19` = col_double(),
##   ..   `thinness_5-9` = col_double(),
##   ..   income_composition_of_resources = col_double(),
##   ..   schooling = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
head(data_input)
## # A tibble: 6 × 22
##   country    year status life_…¹ adult…² infan…³ alcohol perce…⁴ hepat…⁵ measles
##   <chr>     <dbl> <chr>    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 Afghanis…  2015 Devel…    65       263      62    0.01   71.3       65    1154
## 2 Afghanis…  2014 Devel…    59.9     271      64    0.01   73.5       62     492
## 3 Afghanis…  2013 Devel…    59.9     268      66    0.01   73.2       64     430
## 4 Afghanis…  2012 Devel…    59.5     272      69    0.01   78.2       67    2787
## 5 Afghanis…  2011 Devel…    59.2     275      71    0.01    7.10      68    3013
## 6 Afghanis…  2010 Devel…    58.8     279      74    0.01   79.7       66    1989
## # … with 12 more variables: BMI <dbl>, under_5_mortality <dbl>, polio <dbl>,
## #   total_expenditure <dbl>, diphtheria <dbl>, `HIV/AIDS_mortality` <dbl>,
## #   GDP <dbl>, population <dbl>, `thinness_1-19` <dbl>, `thinness_5-9` <dbl>,
## #   income_composition_of_resources <dbl>, schooling <dbl>, and abbreviated
## #   variable names ¹​life_expectancy, ²​adult_mortality, ³​infant_mortality,
## #   ⁴​percent_expenditure, ⁵​hepatitis_B

Part 2. Clean and filter data

2.1 Missing data

missing.rows = dim(data_input)[1] -  dim(na.omit(data_input))[1]

missings_df <- data.frame(type=c("missing", "non-missing") ,count = c(missing.rows,  dim(na.omit(data_input))[1]))

ggplot(missings_df, aes(fill=type, y="", x=count)) +
    geom_bar(position="stack", stat="identity")+
    ggtitle("Missing vs Non-missing row counts") +
    xlab("Missing count") + ylab("") 

vis_miss(data_input)

data_fill <- kNN(data_input)
data_fill <- subset(data_fill, select = country:schooling)
vis_miss(data_fill)

2.2 Remove unnecessary variables and factorize categorical variables:

data<-data_fill[-c(1,2,3)]
glimpse(data)
## Rows: 2,938
## Columns: 19
## $ life_expectancy                 <dbl> 65.0, 59.9, 59.9, 59.5, 59.2, 58.8, 58…
## $ adult_mortality                 <dbl> 263, 271, 268, 272, 275, 279, 281, 287…
## $ infant_mortality                <dbl> 62, 64, 66, 69, 71, 74, 77, 80, 82, 84…
## $ alcohol                         <dbl> 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.…
## $ percent_expenditure             <dbl> 71.279624, 73.523582, 73.219243, 78.18…
## $ hepatitis_B                     <dbl> 65, 62, 64, 67, 68, 66, 63, 64, 63, 64…
## $ measles                         <dbl> 1154, 492, 430, 2787, 3013, 1989, 2861…
## $ BMI                             <dbl> 19.1, 18.6, 18.1, 17.6, 17.2, 16.7, 16…
## $ under_5_mortality               <dbl> 83, 86, 89, 93, 97, 102, 106, 110, 113…
## $ polio                           <dbl> 6, 58, 62, 67, 68, 66, 63, 64, 63, 58,…
## $ total_expenditure               <dbl> 8.16, 8.18, 8.13, 8.52, 7.87, 9.20, 9.…
## $ diphtheria                      <dbl> 65, 62, 64, 67, 68, 66, 63, 64, 63, 58…
## $ `HIV/AIDS_mortality`            <dbl> 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1…
## $ GDP                             <dbl> 584.25921, 612.69651, 631.74498, 669.9…
## $ population                      <dbl> 33736494, 327582, 31731688, 3696958, 2…
## $ `thinness_1-19`                 <dbl> 17.2, 17.5, 17.7, 17.9, 18.2, 18.4, 18…
## $ `thinness_5-9`                  <dbl> 17.3, 17.5, 17.7, 18.0, 18.2, 18.4, 18…
## $ income_composition_of_resources <dbl> 0.479, 0.476, 0.470, 0.463, 0.454, 0.4…
## $ schooling                       <dbl> 10.1, 10.0, 9.9, 9.8, 9.5, 9.2, 8.9, 8…

2.3 Data transformation

ggplot(data, aes(x=life_expectancy)) + 
    geom_density(alpha=.3, fill="red", color="red", size=1.5)+
    geom_vline(aes(xintercept=mean(life_expectancy)), size=1)+
    ggtitle("Distribution density of Life.expectancy") +
    theme(text = element_text(size = 18))

sprintf("Skewness: [%s]", toString(skewness(data$life_expectancy, na.rm = TRUE)))
## [1] "Skewness: [-0.640978390586901]"
data$life_expectancy <- sqrt(max(data$life_expectancy+1)- data$life_expectancy)

ggplot(data, aes(x=life_expectancy)) +
    geom_density(alpha=.3, fill="red", color="red", size=1.5)+
    geom_vline(aes(xintercept=mean(life_expectancy)))+
    ggtitle("Distribution density of Life.expectancy") +
    theme(text = element_text(size = 18))

Part 3. Exploratory Data Analysis (EDA)

3.1 Correlation and collinearity

corr <- round(cor(data), 3)
ggcorrplot(corr,type = "upper", lab = TRUE, , outline.color = "black", lab_size = 1, legend.title = "Correlation")+
ggtitle("Correlation Matrix")

model_linear <- lm(life_expectancy~ ., data = data)
vifs <- data.frame(vif(model_linear))

ggplot(vifs, aes(y=vif.model_linear., x=row.names(vifs))) +
    geom_bar(aes(fill=vif.model_linear.>5),stat="identity")+
    scale_y_continuous(trans = "sqrt",  breaks = c(5, 10, 50, 100))+
    geom_hline(yintercept = 5, colour = "red") +
    ggtitle("VIF per feature") +
    xlab("Featurs") + ylab("VIF") +
    theme(axis.text.x=element_text(angle=40, hjust=1))

data_EDA <- subset(data, select = -c(infant_mortality, `thinness_5-9`
, GDP))

model_EDA <- lm(life_expectancy~ ., data = data_EDA)
vifs_EDA <- data.frame(vif(model_EDA))


ggplot(vifs_EDA, aes(y=vif.model_EDA., x=row.names(vifs_EDA))) +
    geom_bar(aes(fill=vif.model_EDA.>5),stat="identity")+
    scale_y_continuous(trans = "sqrt",  breaks = c(5, 10, 50, 100))+
    geom_hline(yintercept = 5, colour = "red") +
    ggtitle("VIF per feature") +
    xlab("Featurs") + ylab("VIF") +
    theme(axis.text.x=element_text(angle=40, hjust=1))

Part 4. Data visualization

data_visu <- read.csv("data_Visu.csv")
data_visu <- na.omit(data_visu)
g <- list(
showframe = TRUE,
showcoastlines = TRUE,
projection = list(type = 'Mercator')
)

plot <- plot_geo(data_fill, locationmode="country names")%>%
    add_trace(locations=~country,
             z=~life_expectancy,
             frame=~year) %>%
    layout(
  title = with(data, paste('Mean Life Expectancy')),
  geo = g
)
plot
fig1<-plot_ly(data_visu,x=~schooling,y=~life_expectancy,size=~population,frame = ~year,text=~country,color = ~Continent,log_x=TRUE,size_max=60)
fig1
fig2<-plot_ly(data_visu,x=~percent_expenditure,y=~life_expectancy,size=~population,frame = ~year,text=~country,color = ~Continent,log_x=TRUE,size_max=60)
fig2
ggplot(data_visu,aes(x=GDP,y=life_expectancy))+geom_point(color="red")+geom_smooth(method="lm")+facet_wrap(~Continent)

ggplot(data_visu,aes(x=income_composition_of_resources,y=life_expectancy))+geom_point(color="blue")+geom_smooth(method="lm")+facet_wrap(~Continent)

Part 5. Models

1. Model (full)

Sử dụng data đầy đủ để tạo model với tất cả cột

set.seed(20133076)
sample_full<-sample(c(TRUE,FALSE),nrow(data),replace=TRUE, prob = c(0.7,0.3))

train_full<-data[sample_full,]
x_test_full<-data[!sample_full,]
y_test_full<-data[!sample_full,]$life_expectancy

model_full<-lm(life_expectancy~.,data=train_full)

summary(model_full)
## 
## Call:
## lm(formula = life_expectancy ~ ., data = train_full)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.48480 -0.22630  0.03941  0.27688  1.89028 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      6.290e+00  7.865e-02  79.978  < 2e-16 ***
## adult_mortality                  1.928e-03  1.108e-04  17.404  < 2e-16 ***
## infant_mortality                -7.091e-03  1.181e-03  -6.003 2.28e-09 ***
## alcohol                         -8.177e-03  3.339e-03  -2.449 0.014403 *  
## percent_expenditure             -3.921e-05  1.117e-05  -3.510 0.000457 ***
## hepatitis_B                      1.360e-03  5.502e-04   2.472 0.013505 *  
## measles                          1.390e-06  1.196e-06   1.162 0.245478    
## BMI                             -2.836e-03  6.806e-04  -4.166 3.23e-05 ***
## under_5_mortality                5.284e-03  8.666e-04   6.098 1.28e-09 ***
## polio                           -2.126e-03  6.229e-04  -3.414 0.000653 ***
## total_expenditure               -1.794e-02  4.623e-03  -3.881 0.000107 ***
## diphtheria                      -3.982e-03  6.699e-04  -5.944 3.25e-09 ***
## `HIV/AIDS_mortality`             4.225e-02  2.326e-03  18.161  < 2e-16 ***
## GDP                             -5.224e-06  1.702e-06  -3.068 0.002180 ** 
## population                       2.252e-10  2.619e-10   0.860 0.390010    
## `thinness_1-19`                 -1.107e-03  7.179e-03  -0.154 0.877462    
## `thinness_5-9`                   1.272e-02  7.097e-03   1.792 0.073239 .  
## income_composition_of_resources -9.451e-01  8.887e-02 -10.635  < 2e-16 ***
## schooling                       -8.426e-02  5.975e-03 -14.101  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4738 on 2064 degrees of freedom
## Multiple R-squared:  0.8038, Adjusted R-squared:  0.8021 
## F-statistic: 469.7 on 18 and 2064 DF,  p-value: < 2.2e-16

Kiểm tra

pred_full<-predict(model_full, newdata = x_test_full)
value_model <- data.frame("Full", rmse(pred_full,y_test_full), summary(model_full)$adj.r.squared,length(coef(model_full))-1)

colnames(value_model) = c("Model", "Root Mean Squared Error", "R squared", "Number of variable")

value_model
##   Model Root Mean Squared Error R squared Number of variable
## 1  Full                0.481023 0.8020737                 18
par(mfrow=c(2,2))
plot(model_full)

pred_acct_full <- data.frame(pred_full, y_test_full)
ggplot(pred_acct_full,aes(x=pred_full, y= y_test_full)) +
  geom_point() +
  geom_abline(intercept=0, slope=1) +
  labs(x='Predicted Values', y='Actual Values', title='Predicted vs. Actual Values')

2. Model (AIC)

set.seed(20133082)
sample_step<-sample(c(TRUE,FALSE),nrow(data),replace=TRUE, prob = c(0.7,0.3))

train_step<-data[sample_step,]
x_test_step<-data[!sample_step,]
y_test_step<-data[!sample_step,]$life_expectancy

model_full_step<-lm(life_expectancy~.,data=train_step)
model_step<-step(model_full_step,direction = "both",test="F")
summary(model_step)
## 
## Call:
## lm(formula = life_expectancy ~ adult_mortality + infant_mortality + 
##     alcohol + percent_expenditure + hepatitis_B + measles + BMI + 
##     under_5_mortality + polio + total_expenditure + diphtheria + 
##     `HIV/AIDS_mortality` + GDP + `thinness_5-9` + income_composition_of_resources + 
##     schooling, data = train_step)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.40177 -0.22772  0.03268  0.28336  1.91790 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      6.147e+00  7.926e-02  77.565  < 2e-16 ***
## adult_mortality                  2.030e-03  1.136e-04  17.874  < 2e-16 ***
## infant_mortality                -7.433e-03  1.332e-03  -5.579 2.74e-08 ***
## alcohol                         -9.462e-03  3.401e-03  -2.782 0.005448 ** 
## percent_expenditure             -3.024e-05  1.126e-05  -2.685 0.007312 ** 
## hepatitis_B                      1.085e-03  5.700e-04   1.904 0.057102 .  
## measles                          1.849e-06  1.272e-06   1.453 0.146269    
## BMI                             -3.781e-03  7.030e-04  -5.378 8.42e-08 ***
## under_5_mortality                5.577e-03  9.931e-04   5.615 2.23e-08 ***
## polio                           -2.596e-03  6.556e-04  -3.959 7.79e-05 ***
## total_expenditure               -1.403e-02  4.691e-03  -2.992 0.002807 ** 
## diphtheria                      -3.400e-03  6.941e-04  -4.899 1.04e-06 ***
## `HIV/AIDS_mortality`             4.056e-02  2.377e-03  17.064  < 2e-16 ***
## GDP                             -6.255e-06  1.750e-06  -3.575 0.000359 ***
## `thinness_5-9`                   1.198e-02  3.333e-03   3.595 0.000332 ***
## income_composition_of_resources -8.372e-01  8.923e-02  -9.383  < 2e-16 ***
## schooling                       -7.658e-02  5.937e-03 -12.899  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4764 on 2017 degrees of freedom
## Multiple R-squared:  0.7986, Adjusted R-squared:  0.797 
## F-statistic:   500 on 16 and 2017 DF,  p-value: < 2.2e-16

Kiểm tra

pred_step<-predict(model_step, newdata = x_test_step)
value_model[nrow(value_model) + 1,] = c("AIC",rmse(y_test_step,pred_step), summary(model_step)$adj.r.squared, length(coef(model_step))-1)
value_model
##   Model Root Mean Squared Error         R squared Number of variable
## 1  Full       0.481022999289545 0.802073745562233                 18
## 2   AIC       0.471017271177039 0.797046965097611                 16
par(mfrow=c(2,2))
plot(model_step)

pred_acct_step <- data.frame(pred_step, y_test_step)
ggplot(pred_acct_step,aes(x=pred_step, y= y_test_step)) +
  geom_point() +
  geom_abline(intercept=0, slope=1) +
  labs(x='Predicted Values', y='Actual Values', title='Predicted vs. Actual Values')

3. Feature selection methods

Lựa chọn theo 3 phương pháp:

theo đó dựa trên:

3.1 Best subset

set.seed(20133035)
regfit.best <- regsubsets(life_expectancy~., data= data_EDA, nvmax = 18)
reg.summary <- summary(regfit.best)

par(mfrow=c(2,2))

#- residual sum of squares:
plot(reg.summary$rss,xlab="Number of Variables",ylab="RSS",type="l")
which.min(reg.summary$rss)
## [1] 15
points(15,reg.summary$rss[15], col="red",cex=2,pch=20)

# adjusted-R^2 with its largest value
plot(reg.summary$adjr2,xlab="Number of Variables",ylab="Adjusted Rsq",type="l")
which.max(reg.summary$adjr2)
## [1] 14
points(14,reg.summary$adjr2[14], col="red",cex=2,pch=20)

# Mallow's Cp with its smallest value
plot(reg.summary$cp,xlab="Number of Variables",ylab="Cp",type='l')
which.min(reg.summary$cp)
## [1] 14
points(14,reg.summary$cp[14],col="red",cex=2,pch=20)

# BIC with its smallest value
plot(reg.summary$bic,xlab="Number of Variables",ylab="BIC",type='l')
which.min(reg.summary$bic)
## [1] 10
points(10,reg.summary$bic[15],col="red",cex=2,pch=20)

Có thể thấy BIC cho số biến ít nhất nên từ đây chỉ xét theo BIC.

3.2 Forward inclusion

par(mfrow=c(1,1))
set.seed(20133013)
regfit.fwd <- regsubsets(life_expectancy~.,data=data_EDA,nvmax=18,method="forward")
fwd.summary <-summary(regfit.fwd)

# fwd.summary
#set_plot_dimensions(8,6)
plot(fwd.summary$bic,xlab="Number of Variables",ylab="BIC",type='l')
which.min(fwd.summary$bic)
## [1] 10
points(10,fwd.summary$bic[10],col="red",cex=2,pch=20)

3.3 Backward elimination

set.seed(20133013)
regfit.bwd <- regsubsets(life_expectancy~.,data=data_EDA,nvmax=18,method="backward")
bwd.summary <- summary(regfit.bwd)

# fwd.summary
#set_plot_dimensions(8,6)
plot(bwd.summary$bic,xlab="Number of Variables",ylab="BIC",type='l')
which.min(bwd.summary$bic)
## [1] 10
points(10,bwd.summary$bic[10],col="red",cex=2,pch=20)

Tất cả các phương pháp với BIC đều cho ra kết quả là 10 biến.

v_names <- rownames(as.data.frame(coef(regfit.best,10)))
coefs<- data.frame(v_names)
coefs$best_coef_value<- coef(regfit.best,10)
coefs$fwd_coef_value <-  coef(regfit.fwd,10)
coefs$bwd_coef_value <-  coef(regfit.bwd,10)

#set_plot_dimensions(18,4)
ggplot(coefs,
       aes(x=v_names, y=best_coef_value, fill=best_coef_value)) +
                                  geom_bar(stat="identity") +
                                  ggtitle("Features & coeffecients: [method Best]") +
                                  xlab("Feature") + ylab("Coef value") +
                                  theme(axis.text.x=element_text(angle=45, hjust=1))

ggplot(coefs,
       aes(x=v_names, y=fwd_coef_value, fill=fwd_coef_value)) +
                                  geom_bar(stat="identity") +
                                  ggtitle("Features & coeffecients: [method Forward inclusion]") +
                                  xlab("Feature") + ylab("Coef value") +
                                  theme(axis.text.x=element_text(angle=45, hjust=1))

ggplot(coefs,
       aes(x=v_names, y=bwd_coef_value, fill=bwd_coef_value)) +
                                  geom_bar(stat="identity") +
                                  ggtitle("Feature & coeffecients: [method Backward elimination]") +
                                  xlab("Feature") + ylab("Coef value") +
                                  theme(axis.text.x=element_text(angle=45, hjust=1))

3 phương pháp đã chọn ra 10 biến. Tạo data:

data_select<-subset(data, select = c(life_expectancy, adult_mortality, percent_expenditure, BMI, polio, total_expenditure, diphtheria, `HIV/AIDS_mortality`, `thinness_1-19`,income_composition_of_resources,  schooling ))

3.4 Final model

set.seed(20133077)
sample_select<-sample(c(TRUE, FALSE), nrow(data_select), replace=TRUE, prob=c(0.70,0.30))

train_select<-data_select[sample_select,]
x_select_test<-data_select[!sample_select,]
y_select_test<-data_select[!sample_select,]$life_expectancy


model_select<-lm(life_expectancy~.,data=train_select)
summary(model_select)
## 
## Call:
## lm(formula = life_expectancy ~ ., data = train_select)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.39439 -0.23390  0.03601  0.28103  2.07112 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      6.302e+00  7.485e-02  84.203  < 2e-16 ***
## adult_mortality                  2.000e-03  1.114e-04  17.948  < 2e-16 ***
## percent_expenditure             -6.365e-05  5.901e-06 -10.787  < 2e-16 ***
## BMI                             -3.496e-03  6.828e-04  -5.121 3.33e-07 ***
## polio                           -2.464e-03  6.475e-04  -3.805 0.000146 ***
## total_expenditure               -9.900e-03  4.609e-03  -2.148 0.031841 *  
## diphtheria                      -3.181e-03  6.496e-04  -4.898 1.05e-06 ***
## `HIV/AIDS_mortality`             4.378e-02  2.382e-03  18.382  < 2e-16 ***
## `thinness_1-19`                  1.278e-02  2.908e-03   4.396 1.16e-05 ***
## income_composition_of_resources -9.589e-01  8.995e-02 -10.661  < 2e-16 ***
## schooling                       -8.606e-02  5.841e-03 -14.733  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4808 on 2073 degrees of freedom
## Multiple R-squared:  0.7898, Adjusted R-squared:  0.7888 
## F-statistic: 779.1 on 10 and 2073 DF,  p-value: < 2.2e-16

Kiểm tra

pred_select<-predict(model_select, newdata = x_select_test)

value_model[nrow(value_model) + 1,] = c("BIC",rmse(y_select_test,pred_select), summary(model_select)$adj.r.squared, length(coef(model_select))-1)
value_model
##   Model Root Mean Squared Error         R squared Number of variable
## 1  Full       0.481022999289545 0.802073745562233                 18
## 2   AIC       0.471017271177039 0.797046965097611                 16
## 3   BIC         0.4841986829211 0.788832518734723                 10
par(mfrow=c(2,2))
plot(model_select)

pred_acct_select <- data.frame(pred_select, y_select_test)
ggplot(pred_acct_select,aes(x=pred_select, y= y_select_test)) +
  geom_point() +
  geom_abline(intercept=0, slope=1) +
  labs(x='Predicted Values', y='Actual Values', title='Predicted vs. Actual Values')